# ===========================================================================
# QC-Bench Local Evaluation Script
# ===========================================================================
# This script runs the second part of our evaluation process, testing open-source
# models available via Hugging Face on our quantum computing benchmark.
#
# The experiments were conducted on a local cluster equipped with two
# Tesla V100 GPUs (32GB each) using FP16 inference. This approach was necessary
# for models that weren't accessible through API services and needed to be
# evaluated directly using the Transformers library.
#
# Files needed to run:
# - qc200.json, qc1000.json, qc5184.json (for benchmark subsets)
# - basic_concepts.json, gates_and_circuit_design.json, qml.json, security.json,
#   error_correction.json, algorithms.json, distributed_computing.json (for topic benchmarks)

import json
import torch
import re
import gc
import signal
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForCausalLM

# Timeout setup
class TimeoutException(Exception):
    """Signals that a model load or evaluation step exceeded its time limit."""

def timeout_handler(signum, frame):
    """Signal handler that aborts the current step by raising TimeoutException.

    Both arguments are accepted to satisfy the signal-handler call signature
    and are not used.

    Raises:
        TimeoutException: always.
    """
    message = "Timed out while loading or evaluating the model."
    raise TimeoutException(message)

# Install timeout_handler for SIGALRM so that an expiring signal.alarm()
# surfaces as a TimeoutException in the load/evaluation code below.
# NOTE(review): SIGALRM is documented as Unix-only — confirm the target platform.
signal.signal(signal.SIGALRM, timeout_handler)

# Hugging Face model identifiers to evaluate, processed in this order.
model_names = [
    "ibm-granite/granite-3.3-8b-instruct",
    "microsoft/Phi-4-mini-reasoning",
    "microsoft/Phi-4-reasoning",
    "microsoft/Phi-4-reasoning-plus",
    "microsoft/phi-2",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    "HuggingFaceH4/zephyr-7b-beta",
    "meta-llama/Llama-2-13b-chat-hf",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.1-8B",
    "meta-llama/Llama-3.1-8B-Instruct",
    "google/gemma-7b",
    "google/gemma-2-2b-it",
    "EleutherAI/gpt-j-6b",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "databricks/dolly-v1-6b",
    "Qwen/Qwen1.5-MoE-A2.7B",
]

# Benchmark question files each model is evaluated on.
json_files = ["qml.json"]

# Result table: accuracies[model_name][json_file] = accuracy (or a status string)
accuracies = {name: {} for name in model_names}

# Wall-clock budget (seconds) for loading one model and for evaluating one file.
_STEP_TIMEOUT_SECONDS = 3600

# Accepts an optional echoed "Answer:" prefix followed by a standalone choice
# letter ("B", "b.", "(C)"), but not the first letter of a word such as "Based".
_ANSWER_RE = re.compile(r"\s*(?:Answer:\s*)?\(?([A-D])\b", re.IGNORECASE)


def _extract_choice(completion):
    """Return the choice letter (upper-cased) that starts *completion*, or "?"."""
    match = _ANSWER_RE.match(completion)
    return match.group(1).upper() if match else "?"


def _first_line(exc):
    """Return the first line of an exception message, or its class name if empty."""
    lines = str(exc).splitlines()
    return lines[0] if lines else type(exc).__name__


# Main loop: load each model once, evaluate it on every question file, and
# record either a formatted accuracy or a status string in `accuracies`.
for model_name in model_names:
    print(f"\nLoading model {model_name}...")
    try:
        signal.alarm(_STEP_TIMEOUT_SECONDS)  # 1-hour timeout for loading
        torch.cuda.empty_cache()
        gc.collect()

        # NOTE: trust_remote_code=True allows the model repository to run its
        # own Python code on this machine; only list repositories you trust.
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16,
            max_memory={i: "30GiB" for i in range(torch.cuda.device_count())},
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )
        model.eval()

    except TimeoutException:
        print(f"{model_name} failed to load in 1h; skipping.")
        for jf in json_files:
            accuracies[model_name][jf] = "LOAD TIMEOUT"
        continue
    except Exception as e:
        # str(e) can be empty, so never index its lines directly.
        msg = _first_line(e)
        print(f"{model_name} load error: {msg}; skipping.")
        for jf in json_files:
            accuracies[model_name][jf] = f"LOAD ERROR: {msg}"
        continue
    finally:
        # Always disarm the load timer, including on the error paths, so a
        # stale alarm cannot fire during later work.
        signal.alarm(0)

    # if loaded successfully, evaluate on each JSON
    for json_file in json_files:
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                questions = json.load(f)
        except Exception as e:
            print(f"  {json_file}: failed to load ({e})")
            accuracies[model_name][json_file] = "JSON LOAD ERROR"
            continue

        total = len(questions)
        if total == 0:
            # Avoid a ZeroDivisionError when computing the accuracy below.
            print(f"  {json_file}: no questions found; skipping")
            accuracies[model_name][json_file] = "NO QUESTIONS"
            continue

        correct = 0
        errors = 0
        timed_out = False
        signal.alarm(_STEP_TIMEOUT_SECONDS)  # 1-hour timeout for evaluation

        try:
            for q in questions:
                try:
                    prompt = (
                        f"You are an expert in quantum computing. Choose the correct answer to the question below.\n\n"
                        f"Question: {q['question']}\n"
                        f"A. {q['A']}\n"
                        f"B. {q['B']}\n"
                        f"C. {q['C']}\n"
                        f"D. {q['D']}\n\n"
                        "Answer with only one letter: A, B, C, or D.\n\n"
                        "Answer:"
                    )
                    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
                    inputs = {k: v.to(model.device) for k, v in inputs.items()}
                    prompt_len = inputs["input_ids"].shape[1]
                    with torch.no_grad():
                        # Greedy decoding; no temperature is passed because it
                        # is only used when sampling is enabled.
                        outputs = model.generate(
                            **inputs,
                            max_new_tokens=5,
                            do_sample=False,
                            pad_token_id=tokenizer.eos_token_id
                        )
                    # Decode only the newly generated tokens so the prompt text
                    # can never be mistaken for the model's answer.
                    completion = tokenizer.decode(
                        outputs[0][prompt_len:], skip_special_tokens=True
                    )
                    if _extract_choice(completion) == q.get("solution"):
                        correct += 1
                except TimeoutException:
                    # TimeoutException subclasses Exception; re-raise so the
                    # generic handler below does not swallow it.
                    raise
                except Exception as e:
                    # Best effort: a failed question is counted as incorrect,
                    # but the failure is reported rather than hidden.
                    errors += 1
                    if errors == 1:
                        print(f"    first question error on {json_file}: {_first_line(e)}")
        except TimeoutException:
            print(f"    Evaluation on {json_file} timed out; marking as TIMEOUT")
            timed_out = True
        finally:
            signal.alarm(0)

        if timed_out:
            accuracies[model_name][json_file] = "EVAL TIMEOUT"
            continue

        accuracies[model_name][json_file] = f"{correct/total:.2%}"
        print(f"  {json_file}: {correct}/{total} = {accuracies[model_name][json_file]}")
        if errors:
            print(f"    {errors}/{total} question(s) raised errors and were counted as incorrect")

    # clean up before next model
    del model, tokenizer
    gc.collect()
    torch.cuda.empty_cache()

# Human-readable recap: one section per model, one line per question file
print("\n=== DETAILED ACCURACY SUMMARY ===")
for name in accuracies:
    print(f"\nModel: {name}")
    for dataset, score in accuracies[name].items():
        print(f"  {dataset}: {score}")

# Persist the same result table as indented JSON
serialized = json.dumps(accuracies, indent=2)
with open("huggingface_results.json", "w") as out_file:
    out_file.write(serialized)